library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(stats)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(glue)
library(imputeTS)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rlang':
##
## flatten, unbox
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.92 loaded
library(readr)
library(tibbletime)
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:rlang':
##
## set_names
# Load the dataset (hourly traffic observations; assumes the CSV is in the
# working directory)
traffic_data <- read.csv("traffic_volume.csv")
# Select the columns "date_time" and "traffic_volume"
traffic_data <- traffic_data %>%
select(date_time, traffic_volume)
# Convert "date_time" to POSIXct format
traffic_data$date_time <- ymd_hms(traffic_data$date_time)
# Remove incomplete calendar years (2012 and 2018)
traffic_data <- traffic_data %>%
filter(!(year(date_time) %in% c(2012, 2018)))
# Printing the traffic volume for the first and last year.
year_2013_data <- traffic_data %>%
filter(year(date_time) == 2013)
year_2017_data <- traffic_data %>%
filter(year(date_time) == 2017)
# Print the traffic volume for the first and last year
cat("2013: ",head(year_2013_data$traffic_volume, 5),"\n")
## 2013: 1439 1502 933 576 372
cat("2017: ",head(year_2017_data$traffic_volume, 5))
## 2017: 1848 1806 1211 794 500
# Remove the 29th of February in leap years
traffic_data <- traffic_data %>%
filter(!(month(date_time) == 2 & day(date_time) == 29))
# Verify that the number of rows for each calendar year is 365:
# dividing the hourly row count by 24 gives full days per year.
# NOTE(review): assumes exactly one observation per hour with no gaps or
# duplicates — confirm against the raw data.
year_counts <- traffic_data %>%
group_by(year(date_time))%>%
summarise(row_count = n()/ (24))
year_counts
# Plot the whole hourly series with yearly breaks on the x-axis
ggplot(traffic_data, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 year", date_labels = "%b %Y") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Plotting the whole data")
# Plot all of the 2013 data with monthly breaks on the x-axis
ggplot(year_2013_data, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b %Y") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Plotting all of 2013 data")
# Plot February 2013 only, with daily breaks on the x-axis.
# Use `<-` (not `=`) for assignment and a name that does not shadow
# base::data().
feb_2013_data <- year_2013_data %>%
  filter(month(date_time) == 2)
ggplot(feb_2013_data, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  # "%d %b" gives each daily tick a distinct label; the previous
  # "%b %Y" repeated "Feb 2013" on every tick
  scale_x_datetime(date_breaks = "1 day", date_labels = "%d %b") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Plotting february in 2013")
# Summary statistics (mean, sd, min, max) of traffic_volume for an
# already-grouped data frame. Factored into a helper so the four
# aggregation levels below share one definition instead of repeating it.
summarise_volume <- function(grouped) {
  grouped %>%
    summarise(mean_volume = mean(traffic_volume),
              std_dev_volume = sd(traffic_volume),
              min_volume = min(traffic_volume),
              max_volume = max(traffic_volume))
}
# Aggregate data at daily level
daily_aggregated <- traffic_data %>%
  group_by(date = date(date_time)) %>%
  summarise_volume()
# Aggregate data at weekly level
weekly_aggregated <- traffic_data %>%
  group_by(week = lubridate::floor_date(date_time, "week")) %>%
  summarise_volume()
# Aggregate data at monthly level
monthly_aggregated <- traffic_data %>%
  group_by(month = lubridate::floor_date(date_time, "month")) %>%
  summarise_volume()
# Aggregate data at yearly level
yearly_aggregated <- traffic_data %>%
  group_by(year = year(date_time)) %>%
  summarise_volume()
# Line plot of the mean with a +/- 1 standard-deviation ribbon, plus
# min and max lines. `x_var` and `y_var` are column names as strings;
# the data must also contain std_dev_volume, min_volume and max_volume.
plot_with_stats <- function(data, x_var, y_var, title) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_line(aes(color = "Mean")) +
    geom_ribbon(aes(ymin = .data[[y_var]] - std_dev_volume,
                    ymax = .data[[y_var]] + std_dev_volume,
                    fill = "Std. Dev."),
                alpha = 0.2) +
    geom_line(aes(y = min_volume, color = "Min")) +
    geom_line(aes(y = max_volume, color = "Max")) +
    labs(x = x_var, y = y_var, title = title) +
    theme_minimal() +
    scale_color_manual(values = c("Mean" = "black", "Min" = "blue", "Max" = "red")) +
    scale_fill_manual(values = c("Std. Dev." = "gray"))
}
# Create and display one plot per aggregation level
plot_with_stats(daily_aggregated, "date", "mean_volume", "Daily Traffic Volume")
plot_with_stats(weekly_aggregated, "week", "mean_volume", "Weekly Traffic Volume")
plot_with_stats(monthly_aggregated, "month", "mean_volume", "Monthly Traffic Volume")
plot_with_stats(yearly_aggregated, "year", "mean_volume", "Yearly Traffic Volume")
# Candidate seasonal frequencies (observations per cycle)
frequencies <- c(7, 24, 24*7, 24*12, 24*24)
freq_hour <- c(24*7, 24*12, 24*365)
# STL-decompose a series at the given frequency and plot the result.
# Dispatch on the columns actually present: the raw hourly data has a
# `traffic_volume` column, the aggregated frames have `mean_volume`.
# (Column-based dispatch replaces the previous identical(data, traffic_data)
# comparison, which silently broke whenever the global was modified.)
# `sub_interval` is accepted for backward compatibility but unused.
# Returns the stl object invisibly so callers can inspect it.
perform_stl_and_plot <- function(data, interval, frequency, sub_interval = NULL) {
  series <- if ("traffic_volume" %in% names(data)) {
    data$traffic_volume
  } else {
    data$mean_volume
  }
  stl_result <- stl(ts(series, frequency = frequency), s.window = "periodic")
  # Plot the decomposition
  plot(stl_result, main = paste("STL Decomposition -", interval, "-", frequency))
  invisible(stl_result)
}
# hourly aggregated data
# (raw hourly series decomposed at weekly, ~12-day and yearly frequencies)
for (i in freq_hour){
perform_stl_and_plot(traffic_data, "Hourly",frequency = i)}
# Daily aggregated data
# (all candidate frequencies tried on the daily means)
for (i in frequencies){
perform_stl_and_plot(daily_aggregated, "Daily", frequency = i)}
# Weekly aggregated data
# (only the two smallest candidate frequencies make sense at this level)
for (i in frequencies[1:2]){
perform_stl_and_plot(weekly_aggregated, "Weekly", frequency = i)}
# Monthly aggregated data
for (i in frequencies[1:2]){
perform_stl_and_plot(monthly_aggregated, "Monthly", frequency = i)}
We can see that the seasonal component consists of three main parts: seasonality, trend, and remainder. When we combine these three components, they make up the entire dataset. The seasonality specifically depicts the fluctuations in the data over different time periods, revealing a consistent pattern in the data. The trend represents the overall movement or direction of the data, showing consistent upward and downward trends. The remainder is calculated by subtracting the values of the seasonal and trend components from the time series. In our case, we observe that the remainder contains a consistent level of noise, except for the middle portion where the noise is smaller than in the other areas.
When we aggregate by the mean, the result can be heavily affected by outliers in the data — values that are either very high or very low. The median would be more robust with respect to such extreme values (outliers).
The paradigm “more data is better” may not always hold when investigating monthly trends. The relevance, quality, and changing nature of the data should be considered to make informed decisions about adding more data to the analysis.
When investigating monthly data, the presence of older data can adversely affect the analysis of monthly trends. Trends typically follow the movement of the data, which may involve upward or downward movement. This can complicate the analysis of monthly trends when using historical data. Older data may become less relevant, particularly for months that are farther along in the time series.
Simple STL decomposition is designed to handle time series data with
a single dominant season. It may not be the best choice for data with
multiple seasonal patterns, as it primarily focuses on extracting a
single season and trend. When you have multiple seasonal components or
complex seasonality, it makes
sense to use a more advanced decomposition method.
# Load the Romanian energy-consumption dataset
rom <- read.csv('romanian_energy.csv')
# convert the DateTime column to the appropriate time series object
rom <- rom %>%
  mutate(DateTime = ymd_hms(DateTime))
# Visualise the series with its gaps, plus the gap-size and
# missing-value distributions
ggplot(rom, aes(x = DateTime, y = Consumption.Missing)) + geom_line()
ggplot_na_gapsize(rom$Consumption.Missing)
ggplot_na_distribution(rom$Consumption.Missing)
## Task a)
ts_data <- ts(rom$Consumption.Full, frequency = 1)
# Create a vector of different lag.max values to explore
lag_max_values <- c(100, 1000, 36000)
# Create subplots for ACF with different lag.max values. Save and restore
# the graphics state so later plots are not left in a 1x3 layout
# (the previous code never reset par()).
old_par <- par(mfrow = c(1, length(lag_max_values)))
for (lag_max in lag_max_values) {
  acf(ts_data, lag.max = lag_max, main = paste("ACF with lag.max = ", lag_max))
}
par(old_par)
# Mean squared error between ground truth and predictions.
# Both arguments are numeric vectors of the same length.
MSE <- function(y_true, y_pred) {
  err <- y_true - y_pred
  mean(err^2)
}
# Impute the missing series three ways: overall mean ("global"),
# last observation carried forward ("locf"), and seasonal decomposition
# with interpolation ("seadec", assuming a yearly season of 365 days)
df_imp <- rom %>%
  mutate(global = na_mean(Consumption.Missing),
         locf = na_locf(Consumption.Missing),
         seadec = na_seadec(ts(Consumption.Missing,
                               frequency = 365), algorithm = "interpolation"))
imp <- c('global', 'locf', 'seadec')
# MSE of each imputation against the fully observed ground truth.
# vapply over the named methods replaces the previous 1:length(imp) loop
# and yields the same named numeric vector.
mse <- vapply(setNames(imp, imp),
              function(method) MSE(df_imp$Consumption.Full, df_imp[[method]]),
              numeric(1))
# Plot one imputation (column `imp_var` of the global df_imp) against the
# observed series and the ground truth, and print it.
plot_imputation <- function(imp_var) {
  plt <- ggplot_na_imputations(
    x_with_na = df_imp$Consumption.Missing,
    x_with_imputations = df_imp[[imp_var]],
    x_with_truth = df_imp$Consumption.Full,
    size_points = 0.4,
    size_truth = 0.4,
    size_lines = 0.5,
    size_imputations = 1.02
  ) +
    ggtitle(glue("Imputation of {imp_var}"))
  print(plt)
}
# Draw the three imputation plots. Iterate over the method names
# directly instead of over 1:length(imp).
for (method in imp) {
  plot_imputation(method)
}
cat("MSE","\n")
## MSE
mse
## global locf seadec
## 156227.4 482024.2 302835.6
Yes, there are some issues. The global method does not capture the underlying trend or seasonality but merely provides the mean value, which may not accurately represent the data. Locf takes the last available value and carries it forward through the gap, which can lead to poor performance in cases with large gaps. Seadec is an improvement over locf, but it still deviates from the ground truth. Overall, the “global” method performs best when considering the mean squared error values.
# Seasonal periods found in (a): daily, weekly and yearly (hourly data)
seasonal_periods <- c(24, 24*7, 24*365)
# Convert 'Consumption.Full' into a multiple seasonal time series object using 'msts'
ts_full <- msts(rom$Consumption.Full, seasonal.periods = seasonal_periods)
# Decompose the time series using 'mstl'
decomposition_full <- mstl(ts_full)
autoplot(decomposition_full, main = "Decomposition of Consumption.Full")
# Convert 'Consumption.Missing' into a multiple seasonal time series object using 'msts'
ts_missing <- msts(rom$Consumption.Missing, seasonal.periods = seasonal_periods)
# Decompose the time series using 'mstl'
decomposition_missing <- mstl(ts_missing)
miss <- decomposition_missing
# Blank out every row that contains at least one missing value so the
# plot shows gaps. is.na() already matches both NA and NaN, so the
# previous separate NA -> NaN replacement pass was redundant.
miss[apply(is.na(miss), 1, any), ] <- NaN
# Plot both decompositions
autoplot(miss, main = "Decomposition of Consumption.Missing")
# Take into account all reasonable seasonal periods detected in (a)
# Pair one mstl component (selected by column index `number`) from the
# full and the missing decompositions into a two-column data frame.
# `name` is retained for backward compatibility with existing call sites
# but is not used.
merging_miss_full <- function(name, number) {
  data.frame(full = decomposition_full[, number],
             miss = decomposition_missing[, number])
}
# Extract each decomposition component. Column indices follow the mstl
# output layout observed above: 2 = trend, 3..5 = the three seasonal
# terms, 6 = remainder. Use `<-` for assignment (not `=`).
trend <- merging_miss_full("trend", 2)
seas_24 <- merging_miss_full("seasonal24", 3)
seas_168 <- merging_miss_full("seasonal168", 4)
seas_8760 <- merging_miss_full("seasonal8760", 5)
remainder <- merging_miss_full("remainder", 6)
# Initialize a vector to store MSE values for each component
mse_values <- c()
# Impute the "miss" column of a component data frame with na_seasplit.
# Fix: the series handed to ts() is now the single "miss" column; the
# previous code passed the whole two-column data frame — including the
# ground-truth "full" column — into the imputation.
imputing <- function(df) {
  df_1 <- subset(df, select = -1)   # drop the "full" (ground-truth) column
  df_1 <- df_1 %>%
    mutate(imp = na_seasplit(ts(df_1$miss, frequency = 365),
                             algorithm = "interpolation",
                             find_frequency = FALSE))
  return(df_1)
}
# running the function for the different columns
trend_imp <- imputing(trend)
seas_24_imp <- imputing(seas_24)
seas_168_imp <- imputing(seas_168)
seas_8760_imp <- imputing(seas_8760)
remainder_imp <- imputing(remainder)
# Calculating the mse for the different columns
mse_values["trend"] <- MSE(trend$full, trend_imp$imp)
mse_values["seas_24"] <- MSE(seas_24$full, seas_24_imp$imp)
mse_values["seas_168"] <- MSE(seas_168$full, seas_168_imp$imp)
mse_values["seas_8760"] <- MSE(seas_8760$full, seas_8760_imp$imp)
mse_values["remainder"] <- MSE(remainder$full, remainder_imp$imp)
cat("MSE","\n")
## MSE
mse_values
## trend seas_24 seas_168 seas_8760 remainder
## 820.9013 4093.5715 549.8578 1742.7732 6727.5536
# NOTE(review): the recorded values above were produced by the previous
# code path and may differ slightly after the fix.
# Read a JSON file and coerce the parsed result into a data frame.
import_json <- function(file) {
  as.data.frame(fromJSON(file))
}
# Load each regional surface-temperature JSON file
world_df <- import_json("surface_temp/world.json")
tropics_df <- import_json("surface_temp/tropics.json")
sh_df <- import_json("surface_temp/sh.json")
nh_df <- import_json("surface_temp/nh.json")
arctic_df <- import_json("surface_temp/arctic.json")
antarctic_df <- import_json("surface_temp/antarctic.json")
# Explore the head and tail of the data frame
head(world_df, 3)
tail(world_df, 5)
Looking at the tail of the JSON files, we can see that the last three rows can be removed: they are not necessary for the data frame.
# Preprocess one regional data frame: unnest the daily temperature values,
# attach calendar dates, drop metadata, restrict to the requested range,
# and remove leap days so every year has 365 rows.
# `start_date` / `end_date` generalize the previously hard-coded
# 1979-2022 range; the defaults preserve the original behavior.
preprocess_one <- function(df, start_date = "1979-01-01", end_date = "2022-12-31") {
  # Unnest the "data" column to get the temperature values
  df <- df %>%
    unnest(cols = c(data)) %>%
    rename(temperature = data)
  # Create a "date" column: one value per row, starting at start_date.
  # NOTE(review): assumes the series is daily and begins on start_date —
  # confirm against the JSON metadata.
  df$date <- as.Date(start_date) + (seq_along(df$temperature) - 1)
  # Remove the "name" column
  df <- df[, !(names(df) %in% c("name"))]
  # Keep only rows between start_date and end_date (inclusive)
  df <- df[df$date >= as.Date(start_date) & df$date <= as.Date(end_date), ]
  # Reorder columns with "date" on the left side
  df <- df[, c("date", setdiff(names(df), "date"))]
  # Drop February 29th (year-02-29) so leap years also have 365 rows
  df <- df[!(format(df$date, "%m-%d") == "02-29"), ]
  return(df)
}
# Preprocess all the data frames
preprocessed_world_df <- preprocess_one(world_df)
preprocessed_tropics_df <- preprocess_one(tropics_df)
preprocessed_sh_df <- preprocess_one(sh_df)
preprocessed_nh_df <- preprocess_one(nh_df)
preprocessed_arctic_df <- preprocess_one(arctic_df)
preprocessed_antarctic_df <- preprocess_one(antarctic_df)
# Print the head and tail of one of the preprocessed data frames (e.g., preprocessed_world_df)
# Each preprocessed frame has columns "date" and "temperature", one row
# per non-leap day from 1979-01-01 through 2022-12-31
print(head(preprocessed_world_df, 5))
## # A tibble: 5 × 2
## date temperature
## <date> <dbl>
## 1 1979-01-01 12.3
## 2 1979-01-02 12.3
## 3 1979-01-03 12.3
## 4 1979-01-04 12.2
## 5 1979-01-05 12.3
print(tail(preprocessed_world_df, 5))
## # A tibble: 5 × 2
## date temperature
## <date> <dbl>
## 1 2022-12-27 13.5
## 2 2022-12-28 13.5
## 3 2022-12-29 13.4
## 4 2022-12-30 13.4
## 5 2022-12-31 13.3
# Merge the six regional data frames on "date" into one wide data frame
# with columns date, world, tropics, sh, nh, arctic, antarctic.
# Each input's temperature column is renamed to its region BEFORE merging,
# which avoids the duplicated temperature.x/temperature.y column-name
# warnings the previous sequential merge-then-rename approach produced
# and no longer relies on blind positional renaming at the end.
merge_dataset <- function(world, tropics, sh, nh, arctic, antarctic) {
  regions <- list(world = world, tropics = tropics, sh = sh,
                  nh = nh, arctic = arctic, antarctic = antarctic)
  renamed <- Map(function(df, region) {
    # Rename the single non-date column to the region name
    names(df)[names(df) != "date"] <- region
    df
  }, regions, names(regions))
  # Full outer join on "date", folded pairwise across all six regions
  Reduce(function(acc, df) merge(acc, df, by = "date", all = TRUE), renamed)
}
# Call the merge_dataset function with the preprocessed dataframes
df_1 <- merge_dataset(preprocessed_world_df, preprocessed_tropics_df, preprocessed_sh_df,
preprocessed_nh_df, preprocessed_arctic_df, preprocessed_antarctic_df)
## Warning in merge.data.frame(df_merged, nh, by = "date", all = TRUE): column
## names 'temperature.x', 'temperature.y' are duplicated in the result
## Warning in merge.data.frame(df_merged, arctic, by = "date", all = TRUE): column
## names 'temperature.x', 'temperature.y' are duplicated in the result
## Warning in merge.data.frame(df_merged, antarctic, by = "date", all = TRUE):
## column names 'temperature.x', 'temperature.y', 'temperature.x', 'temperature.y'
## are duplicated in the result
# Fill remaining gaps column-wise by linear interpolation (zoo::na.approx);
# column 1 is the date and is left untouched
df_imputed <- df_1
df_imputed[, -1] <- na.approx(df_imputed[, -1])
# Print the head of df_1 to check the merged dataframe
head(df_1)
# Extract the year from the date column and
# add it as a new column to the dataframe
dfplot_1 <- df_imputed # Made a copy; use `<-`, not `=`
dfplot_1$year <- format(dfplot_1$date, "%Y")
# One day-of-year line per year for a single region. Factored into a
# helper so the six per-region plots below share one definition instead
# of six near-identical ggplot blocks.
plot_region <- function(df, region, title) {
  ggplot(df, aes(x = yday(date), y = .data[[region]], color = as.factor(year))) +
    geom_line(alpha = 0.5, linewidth = 0.5) +
    labs(title = title, x = 'Day of the Year',
         y = 'Temperature', color = 'Year') +
    theme_minimal()
}
# Plot the data for each region individually, colored by year
# (top-level calls so the plots auto-print as before)
plot_region(dfplot_1, "world", 'World')
plot_region(dfplot_1, "tropics", 'Tropics')
plot_region(dfplot_1, "sh", 'Southern Hemisphere')
plot_region(dfplot_1, "nh", 'Northern Hemisphere')
plot_region(dfplot_1, "arctic", 'Arctic')
plot_region(dfplot_1, "antarctic", 'Antarctic')
# Compute correlation matrix across the region columns
# (column 1 is the date and is excluded)
cor_matrix <- cor(df_imputed[, -1],
                  use = "complete.obs")
# Generate correlation plot
corrplot(cor_matrix, method = "number", type = "upper",
         order = "hclust", tl.col = "black", tl.srt = 45)
# Perform PCA on the (already imputed) region columns.
# Fix: prcomp() has no na.action argument — the previous call triggered
# an "extra argument 'na.action' will be disregarded" warning. Any
# residual NA rows are dropped explicitly with na.omit() instead.
pca <- prcomp(na.omit(df_imputed[, -1]), scale. = TRUE)
# Plot explained variances
screeplot(pca, type = "l", col = "blue",
          main = "Explained Variances by Principal Components")
# PCA Scores Plot using Base R
biplot(pca, col = c("red", "blue"), cex = 0.75, main = "PCA Biplot")
# Extracting scores
scores <- as.data.frame(pca$x)
# Plotting scores
ggplot(scores, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = PC1), size = 3) +
  labs(title = "PCA Scores Plot", x = "PC1", y = "PC2") +
  theme_minimal()
# Print the summary of the PCA
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1756 1.0300 0.38231 0.20983 0.12410 0.0004078
## Proportion of Variance 0.7889 0.1768 0.02436 0.00734 0.00257 0.0000000
## Cumulative Proportion 0.7889 0.9657 0.99010 0.99743 1.00000 1.0000000
The temperature patterns contrast between two groups of regions: antarctic & sh on one side, and arctic, nh & world on the other. These two groups are inverse to one another, which makes sense because the seasons are opposite between the hemispheres. The tropics do not have a strong correlation with the other regions. The temperature pattern for the world is strongly influenced by regions such as the arctic and nh, which is why it sits on the same side of PC1.
From the PCA biplot, we can see that regions as sh & antarctic and arctic & nh are alike and also contrast each other. While the tropics are not correlated to either of the other regions. The PCA scores plot shows us the variance of the PC1 with consideration to the PC2.
\(s.window\): This parameter should be chosen by considering the nature of the seasonal pattern in the data, ensuring it’s large enough to smooth out noise but not blur distinctions between different periods.
\(t.window\): It needs to be large enough to filter out cyclical and irregular fluctuations, but not so large that it lacks responsiveness to genuine changes in the trend. Typically, a minimum of 3 is used.
The requirement for \(s.window\) and \(t.window\) to be odd and greater than 0 is primarily due to the way moving averages and local smoothing are applied to estimate the seasonal and trend components. Using an odd window size ensures that the local average is symmetrically calculated around the point of interest. For example, if \(s.window = 3\), one observation before and one after the point of interest are used for averaging, maintaining symmetry.
\(s.window\) and \(t.window\) represent the number of periods used for local averaging, and they must be positive integers to make logical sense. A window size of 0 or negative would not allow for any meaningful computation of averages or smoothing.
# Naive additive decomposition of xt into seasonal, trend and remainder
# components using circular (wrap-around) moving averages.
# p: seasonal period; s_window / t_window: odd window sizes for the
# seasonal and trend smoothers.
# NOTE: a second definition later in this file (using truncated,
# non-circular windows) overrides this one when the whole script is run.
compute_components <- function(xt, p, s_window, t_window) {
  n <- length(xt)
  st <- numeric(n) # Seasonal component
  tau_t <- numeric(n) # Trend component
  rt <- numeric(n) # Remainder component
  w <- (s_window - 1) / 2
  v <- (t_window - 1) / 2
  # Seasonal component: circular window of radius w * p around t.
  # Fix: map positions onto 1..n with (idx - 1) %% n + 1; the previous
  # idx %% n + 1 shifted every window one position to the right, so no
  # window was actually centered on t.
  for (t in 1:n) {
    Wt <- (((t - w * p):(t + w * p)) - 1) %% n + 1
    st[t] <- mean(xt[Wt], na.rm = TRUE)
  }
  yt <- xt - st
  # Trend component: circular window of radius v on the deseasonalized series
  for (t in 1:n) {
    Vt <- (((t - v):(t + v)) - 1) %% n + 1
    tau_t[t] <- mean(yt[Vt], na.rm = TRUE)
  }
  # Remainder component: whatever seasonal + trend do not explain
  rt <- yt - tau_t
  return(list(seasonal = st, trend = tau_t, remainder = rt))
}
# Additive decomposition of xt into seasonal, trend and remainder
# components using boundary-truncated (non-circular) moving averages:
# near the edges the averaging window is clipped to the valid index range.
# p: seasonal period; s_window / t_window: odd smoother window sizes.
compute_components <- function(xt, p, s_window, t_window) {
  n <- length(xt)
  half_s <- (s_window - 1) / 2 # seasonal window radius (scaled by p below)
  half_t <- (t_window - 1) / 2 # trend window radius
  # Mean of v over [center - radius, center + radius], clipped to 1..n
  clipped_mean <- function(v, center, radius) {
    idx <- seq(max(1, center - radius), min(n, center + radius))
    mean(v[idx], na.rm = TRUE)
  }
  # Seasonal component
  st <- vapply(seq_len(n), function(t) clipped_mean(xt, t, half_s * p),
               numeric(1))
  yt <- xt - st
  # Trend component, estimated on the deseasonalized series
  tau_t <- vapply(seq_len(n), function(t) clipped_mean(yt, t, half_t),
                  numeric(1))
  # Remainder component
  list(seasonal = st, trend = tau_t, remainder = yt - tau_t)
}
# Grid-search every (s_window, t_window) combination and keep the pair
# that minimizes the variance of the remainder component.
# The grid is enumerated with t_window varying fastest, matching the
# original nested-loop visit order so ties resolve identically.
optimize_parameters <- function(xt, p, s_window_range, t_window_range) {
  grid <- expand.grid(t_window = t_window_range, s_window = s_window_range)
  remainder_var <- function(s_w, t_w) {
    var(compute_components(xt, p, s_w, t_w)$remainder, na.rm = TRUE)
  }
  scores <- mapply(remainder_var, grid$s_window, grid$t_window)
  winner <- which.min(scores)
  list(s_window = grid$s_window[winner], t_window = grid$t_window[winner])
}
# Plot the original series and its three components as a 4-row panel.
# Fixes: removed the unused time_index local, and coerce every series
# with as.numeric() so ggplot does not warn about ts-typed columns
# ("Don't know how to automatically pick scale for object of type <ts>").
plot_decomposition <- function(xt, components, main_title) {
  panel_df <- data.frame(Time = seq_along(xt),
                         Original = as.numeric(xt),
                         Seasonal = as.numeric(components$seasonal),
                         Trend = as.numeric(components$trend),
                         Remainder = as.numeric(components$remainder))
  # One panel per series, all sharing the same minimal theme
  panel <- function(y_var, title) {
    ggplot(panel_df, aes(x = Time, y = .data[[y_var]])) +
      geom_line() +
      labs(y = y_var) +
      ggtitle(title) +
      theme_minimal()
  }
  p1 <- panel("Original", paste(main_title, '- Original Time Series'))
  p2 <- panel("Seasonal", 'Seasonal Component')
  p3 <- panel("Trend", 'Trend Component')
  p4 <- panel("Remainder", 'Remainder Component')
  # Display the plots stacked in one column
  gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 1)
}
# Parameter ranges for grid search (odd window sizes from 3 to 35)
s_window_range <- seq(3, 35, 2)
t_window_range <- seq(3, 35, 2)
# Optimize parameters and plot decomposition for each dataset.
# austres, co2 and nottem ship with base R; p is the observations-per-cycle
# for each series (quarterly = 4, monthly = 12).
datasets <- list(austres = austres, co2 = co2, nottem = nottem)
p_values <- list(austres = 4, co2 = 12, nottem = 12)
for (data_name in names(datasets)) {
xt <- datasets[[data_name]]
p <- p_values[[data_name]]
best_params <- optimize_parameters(xt, p, s_window_range, t_window_range)
components <- compute_components(xt, p, best_params$s_window, best_params$t_window)
plot_decomposition(xt, components, paste(data_name, '- Optimal Decomposition'))
}
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
# Plot the ACF of the original series and of each decomposition component
# (seasonal, trend, remainder). Returns the last acf object invisibly,
# mirroring acf() itself.
plot_autocorrelation <- function(xt, components, title) {
  last <- acf(xt, main = paste(title, "- ACF of Original Series"))
  for (part in c("Seasonal", "Trend", "Remainder")) {
    last <- acf(components[[tolower(part)]],
                main = paste(title, " - ACF of", part, "Component"))
  }
  invisible(last)
}
# Plot autocorrelation for each dataset using the previously computed components
for (data_name in names(datasets)) {
xt <- datasets[[data_name]]
p <- p_values[[data_name]]
# Optimize parameters
best_params <- optimize_parameters(xt, p, s_window_range, t_window_range)
# Compute components using optimized parameters (recomputed here rather
# than reused from the previous loop so this chunk can run independently)
components <- compute_components(xt, p, best_params$s_window, best_params$t_window)
# Plot autocorrelation
plot_autocorrelation(xt, components, data_name)
}
Regarding which component is the most dominant for every dataset:
Austres: The seasonal component is most dominant in the dataset, and appears to have a high correlation to the acf of the original data. The trend component has strongest correlations to the original time series at low values of lag, while the remainder component doesn’t appear to explain much of the variance in the dataset.
Co2: The seasonal component has a high correlation with the original acf for most levels of lag. The trend component and the remainder component is less dominant or this dataset. Correlation levels for these two components have a cyclical distribution for higher levels of lag.
Nottem: The distribution for the trend component and the remainder component appears to be similar to the original ACF for this dataset. The seasonal component appears to be less dominant for this dataset.
## Task d)
Comment on the seasonal components.
We can see that the seasonal periods are daily, weekly and yearly.